#!/bin/bash
# Converter script: turns the content of a zip file into one single text file.
# Every archive member whose extension is listed in FILEEXT is unzipped,
# converted to text by the matching command in CMD, and appended to the
# output file.
#
# usage: zip2txt.sh <zipfile> <outputfile>

# ---- adapt this -------------------------------------------------------------

# Folder where the zip file is unpacked.
# WARNING: DO NOT USE THIS FOLDER FOR ANYTHING ELSE -> every file found in it
# is converted, and its content is removed when the script exits.
TMPFOLDER="/tmp/zipconverter"

# File which is used as temporary storage for one conversion result.
# DO NOT PLACE THE TMPFILE INSIDE/BELOW THE TMPFOLDER IF YOU DON'T EXACTLY
# KNOW WHAT YOU ARE DOING.
TMPFILE="/tmp/zipconverstion.txt"

# Commands needed by this script.
UNZIP_CMD="/usr/bin/unzip"
FIND_CMD="/usr/bin/find"

# Extend the extension and command arrays for your personal needs.
# Note: a command is invoked as "<cmd> <input file> <output file>", i.e. the
# first parameter must be the input and the second the output file name.
FILEEXT[0]="doc"; CMD[0]="/opt/office2txt.sh"
FILEEXT[1]="pdf"; CMD[1]="/usr/bin/pdftotext"

# ---- IO definitions ---------------------------------------------------------

if (( $# < 2 )); then
  printf 'usage: %s <zipfile> <outputfile>\n' "${0##*/}" >&2
  exit 2
fi
zipfile=$1
outputfile=$2

if [[ ! -r "$zipfile" ]]; then
  printf 'error: cannot read zip file: %s\n' "$zipfile" >&2
  exit 1
fi

# Remove all stuff in the temp folder and the temp file. Installed as an EXIT
# trap so the same cleanup also happens when the script stops early.
cleanup() {
  # ":?" aborts the expansion if TMPFOLDER is empty, so this can never
  # become "rm -rf /*".
  rm -rf -- "${TMPFOLDER:?}"/*
  rm -f -- "$TMPFILE"
}
trap cleanup EXIT

# ---- unzip ------------------------------------------------------------------

# Build one "*.ext" pattern per known extension. The patterns are kept in an
# array and passed quoted, so the shell never expands them against files in
# the current directory; unzip does the wildcard matching itself.
patterns=()
for ext in "${FILEEXT[@]}"; do
  patterns+=("*.$ext")
done

# Make sure the folder exists even if unzip extracts nothing, so that find
# below does not fail on a missing directory.
mkdir -p -- "$TMPFOLDER"

# Unzip only content with known extensions into TMPFOLDER, ignoring the case
# of the filter ("-C"). "-P" hands unzip a dummy password so that it does not
# ask on stdin if a file is encrypted. The historical spelling was an unquoted
# \n, which the shell reduces to the plain letter "n"; that value is kept.
"$UNZIP_CMD" -o -qq -C -P "n" "$zipfile" "${patterns[@]}" -d "$TMPFOLDER"

# Put the names of all files inside TMPFOLDER into an array.
# NUL-delimited reading handles whitespace in file names correctly
# (from http://mywiki.wooledge.org/BashFAQ/020).
filenames=()
while IFS= read -r -d '' file; do
  filenames+=("$file")
done < <("$FIND_CMD" "$TMPFOLDER" -type f -print0)

# ---- convert ----------------------------------------------------------------

# Switch off case sensitivity for the extension comparison below.
shopt -s nocasematch

# Convert each file to text according to the command set in CMD.
for file in "${filenames[@]}"; do
  echo "Working on file: $file"

  # Get the file extension: strip the directory part, then everything up to
  # and including the last dot.
  input_filename_w_ext=${file##*/}
  input_extension=${input_filename_w_ext##*.}

  # Search the extension in FILEEXT (case insensitive because of nocasematch).
  # Iterating over the indices also works if the array has gaps.
  for i in "${!FILEEXT[@]}"; do
    # The right-hand side is quoted so that it is compared literally and
    # not interpreted as a glob pattern.
    if [[ "${FILEEXT[i]}" == "$input_extension" ]]; then
      # Split the configured command into words without glob expansion, so a
      # CMD entry may still carry extra arguments as before.
      converter=()
      read -r -a converter <<< "${CMD[i]}"
      if (( ${#converter[@]} == 0 )); then
        # Without this guard the extracted file itself would be executed.
        printf 'warning: no command configured for extension: %s\n' \
          "${FILEEXT[i]}" >&2
        break
      fi

      rm -f -- "$TMPFILE" # make sure no stale result is left over

      # Show and execute the conversion command.
      printf '%s\n' "${converter[*]} $file $TMPFILE"
      if ! "${converter[@]}" "$file" "$TMPFILE"; then
        printf 'warning: conversion failed for: %s\n' "$file" >&2
      fi

      # Append the result to the output file, if the converter produced one.
      if [[ -f "$TMPFILE" ]]; then
        cat -- "$TMPFILE" >> "$outputfile"
      fi
      break
    fi
  done
done

# Switch case sensitivity back on.
shopt -u nocasematch

# The temp folder content and the temp file are removed by the EXIT trap.